from anndata import AnnData
import scrublet as scr
import pandas as pd
import scanpy as sc
import numpy as np
import rbcde
import scipy
import os
# --- scanpy global settings ---
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
# every sc.pl.* call with save=... writes its figure into this folder
sc.settings.figdir = './figures-N2-joint-manifold/'
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80) # low dpi (dots per inch) yields small inline figures
Create a fresh object from the embedded raw data and filter to cells that aren't doublets. Recreate .raw, re-filter genes and recompute the mitochondrial percentage. The cells were already filtered in the previous notebook to exclude ones with fewer than 500 genes or more than 20% mitochondrial reads. Store the post-gene-minimum raw data in .raw. Transform the data to log(CPM/100 + 1) and keep that in .X.
# Reload the doublet-flagged object and rebuild a fresh AnnData from its stored raw counts
adata = sc.read('endometrium-N1-doublet-removal.h5ad')
adata = AnnData(adata.raw.X, var=adata.raw.var, obs=adata.obs)
# keep only cells NOT flagged as doublets (flag computed in the previous notebook)
adata = adata[[not i for i in adata.obs['is_doublet_propagate']]]
# NOTE(review): .raw is snapshotted BEFORE filter_genes, so it keeps all genes,
# while the narrative above says post-gene-minimum -- confirm which is intended
adata.raw = adata.copy()
sc.pp.filter_genes(adata, min_cells=3)
#convert to lower to be species agnostic: human mito start with MT-, mouse with mt-
mito_genes = [name for name in adata.var_names if name.lower().startswith('mt-')]
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mito'] = np.sum(
adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
# normalise each cell to 10,000 counts, then log1p -> .X holds log(CP10K + 1)
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)
The rest of the analysis will be analogous to the one that was done to obtain the preliminary manifold with the doublets still in it.
Remove cell cycle genes by transposing the object and clustering the gene space, reporting back the cluster that has known cycling genes as its members.
# Work on a copy that will be transposed, so GENES become the observations being clustered
bdata = adata.copy()
sc.pp.highly_variable_genes(bdata, min_mean=0.0125, max_mean=3, min_disp=0.5)
bdata = bdata[:, bdata.var['highly_variable']]
# .copy() materialises the HVG view before transposing cells x genes -> genes x cells
bdata = bdata.copy().T
sc.pp.scale(bdata, max_value=10)
sc.tl.pca(bdata, svd_solver='arpack')
sc.pl.pca_variance_ratio(bdata, log=True, save='_ccg_identification.pdf')
num_pcs = 20
sc.pp.neighbors(bdata,n_pcs=num_pcs)
sc.tl.umap(bdata)
sc.tl.leiden(bdata, resolution=0.4)
# flag well-known proliferation genes; whichever gene-space cluster contains them
# will be treated as the cell cycle programme
bdata.obs['known_cyclers'] = [i in ['CDK1','MKI67','CCNB2','PCNA'] for i in bdata.obs_names]
sc.pl.umap(bdata,color=['leiden','known_cyclers'],color_map='OrRd',save='_ccg_identification.pdf')
# report which leiden cluster each known cycler landed in
print(bdata.obs.loc[['CDK1','MKI67','CCNB2','PCNA'],'leiden'])
With the cycling genes identified, clean up the plotting folder by moving all related plots to a subfolder.
def MovePlots(plotpattern, subplotdir):
    """Move all figures whose names contain `plotpattern` into a subfolder of sc.settings.figdir.

    Parameters
    ----------
    plotpattern : str
        Substring matched (with shell globbing) against file names in the figure folder.
    subplotdir : str
        Name of the destination subfolder, created if it does not exist yet.
    """
    # build the destination path once so the existence check and the creation refer
    # to the exact same path -- the original checked str(figdir)+subplotdir but
    # created str(figdir)+'/'+subplotdir, which diverge when figdir has no trailing slash
    target = os.path.join(str(sc.settings.figdir), subplotdir)
    os.makedirs(target, exist_ok=True)
    # shell glob move, same as the original; a no-op (mv complains to stderr)
    # when nothing matches the pattern
    os.system('mv '+str(sc.settings.figdir)+'/*'+plotpattern+'*.* '+target)
# relocate the ccg_identification plots generated above into their own subfolder
MovePlots('ccg_identification','ccg_identification')
Flag cell cycling genes and continue standard pipeline in a helper object until PCA.
# Cluster '8' was identified above (printed leiden assignments) as the cycling-gene
# cluster; flag its member genes. NOTE(review): hard-coded cluster id -- re-check after
# any rerun, as leiden cluster numbering is not guaranteed to be stable.
adata.var['is_ccg'] = [i in bdata.obs[bdata.obs['leiden']=='8'].index for i in adata.var_names]
#compute HVG and PCA on separate helper object and port the results back to the original one
bdata = adata[:, ~adata.var['is_ccg']]
sc.pp.highly_variable_genes(bdata, min_mean=0.0125, max_mean=3, min_disp=0.5)
# copy HVG statistics back; genes absent from bdata (the cycling ones) become NaN
for col in ['highly_variable','means', 'dispersions', 'dispersions_norm']:
    adata.var[col] = bdata.var[col]
#fill NaNs with False so that subsetting to HVGs is possible
adata.var['highly_variable'].fillna(value=False, inplace=True)
bdata = bdata[:, bdata.var['highly_variable']]
sc.pp.scale(bdata, max_value=10)
sc.tl.pca(bdata, svd_solver='arpack', n_comps=50)
# port the PCA back to adata; loadings for genes outside the HVG set stay zero
adata.obsm['X_pca'] = bdata.obsm['X_pca'].copy()
adata.uns['pca'] = bdata.uns['pca'].copy()
adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, 50))
adata.varm['PCs'][adata.var['highly_variable']] = bdata.varm['PCs']
sc.pl.pca_variance_ratio(adata, log=True, save='.pdf')
Correct cross-individual batch effects with Harmony.
num_pcs = 20
# hand the first 20 PCs and the per-cell donor labels over to R
pca = adata.obsm['X_pca'][:, :num_pcs]
batch = adata.obs['individual']
%load_ext rpy2.ipython
%%R -i pca -i batch -o hem
library(harmony)
library(magrittr)
#this gets "90% of the way" to determinism. better than nothing.
set.seed(1)
# do_pca=FALSE: the input is already a PCA embedding; theta controls correction strength
hem = HarmonyMatrix(pca, batch, theta=4, verbose=FALSE, do_pca=FALSE)
# return as a data.frame so rpy2 exports it cleanly back to Python
hem = data.frame(hem)
Compute clustering and UMAP, plot metadata, clustering and individual location UMAPs.
# store the batch-corrected embedding and use it (not raw PCA) for the neighbour graph
adata.obsm['X_harmony'] = hem.values
sc.pp.neighbors(adata, use_rep='X_harmony')
sc.tl.leiden(adata, resolution = 0.5)
sc.tl.umap(adata)
# per-sample metadata; presumably indexed to match adata.obs['sample'] -- TODO confirm
meta = pd.read_csv('meta.csv',index_col=0)
plotmeta = list(meta.columns)
plotmeta.append('sample')
sc.pl.umap(adata, color=plotmeta, save='.pdf')
sc.pl.umap(adata, color='leiden', save='_clustering.pdf')
sc.pl.umap(adata, color='leiden', legend_loc='on data', save='_clustering_clusnumbers.pdf')
# highlight each of the two 'location' groups (EN, MY) on its own UMAP
sc.pl.umap(adata, color='location', groups=['EN'], save='_location_EN.pdf')
sc.pl.umap(adata, color='location', groups=['MY'], save='_location_MY.pdf')
Plot canonical markers for ease of annotation.
# Load the canonical marker list (one gene symbol per line), restrict it to genes
# present in the object, and plot them four per figure on the UMAP.
with open('markers.txt','r') as fid:
    markers = np.unique([line.rstrip() for line in fid.readlines()])
#make sure they're in the dataset, and sort them alphabetically for ease of finding things
markers = sorted([item for item in markers if item in adata.var_names])
# one page of up to four markers per iteration; pages are numbered from 1
for page, start in enumerate(range(0, len(markers), 4), start=1):
    sc.pl.umap(adata, color=markers[start:start+4], use_raw=False, save='-markers'+str(page)+'.pdf', color_map='OrRd')
#goodbye clutter!
MovePlots('markers','markers')
Identify cluster markers via the rank-biserial correlation, an effect size equivalent of the Wilcoxon test. Threshold the coefficient with the literature critical value for a large effect (0.5).
# rank-biserial correlation per gene per cluster, computed on the log-normalised .X
rbcde.RBC(adata, use_raw=False)
# keep genes passing the large-effect threshold (0.5, per the narrative above)
degs, plot_dict = rbcde.filter_markers(adata, use_raw=False, thresh=0.5)
It would appear that one cluster has no large-effect markers. Re-run the filtering at the medium-effect threshold (0.3).
# relax the cutoff to the medium-effect threshold so every cluster gets markers
degs_med, plot_dict_med = rbcde.filter_markers(adata, use_raw=False, thresh=0.3)
Okay, everything has markers now. Take large effect size for clusters that aren't 4 and 5, and report medium for those two.
# Clusters 4 and 5 lack large-effect markers, so they alone keep the medium-effect
# (0.3) results; every other cluster is held to the large-effect (0.5) cutoff.
lenient = ['4','5']
keep = np.array([i in lenient or not j < 0.5 for i, j in zip(degs_med['cluster'], degs_med['RBC'])])
degs = degs_med.loc[keep, :]
degs.to_csv(str(sc.settings.figdir)+'/cluster_markers_rbc.csv')
# also swap in the medium-effect plotting lists for the two lenient clusters
for clus in lenient:
    plot_dict[clus] = plot_dict_med[clus].copy()
# Undo the log1p so means are computed on (normalised) count scale; .X is
# log(CP10K + 1), so expm1 yields counts-per-10k, not raw counts
adata_count = AnnData(np.expm1(adata.X), var=adata.var, obs=adata.obs)
for clus in plot_dict:
    # annotate this cluster's markers with fold change and expression percentages
    degs_clus = degs.loc[degs['cluster']==clus,:].copy()
    degs_clus['log2_FC'] = 0
    degs_clus['percent_cluster'] = 0
    degs_clus['percent_rest'] = 0
    # split cells into this cluster vs all the rest, restricted to the marker genes
    adata_clus = adata_count[adata.obs['leiden']==clus]
    adata_rest = adata_count[[not i for i in adata.obs['leiden']==clus]]
    adata_clus = adata_clus[:, degs_clus.index]
    adata_rest = adata_rest[:, degs_clus.index]
    degs_clus['log2_FC'] = np.asarray(np.log2(np.mean(adata_clus.X,axis=0)/np.mean(adata_rest.X,axis=0))).reshape(-1)
    #are you expressed?
    # binarise the sparse matrices in place (.data holds the nonzero entries)
    adata_clus.X.data = adata_clus.X.data > 0
    adata_rest.X.data = adata_rest.X.data > 0
    degs_clus['percent_cluster'] = np.asarray(100*np.sum(adata_clus.X,axis=0)/adata_clus.shape[0]).reshape(-1)
    degs_clus['percent_rest'] = np.asarray(100*np.sum(adata_rest.X,axis=0)/adata_rest.shape[0]).reshape(-1)
    degs_clus.to_csv(str(sc.settings.figdir)+'/cluster_markers_'+clus+'.csv')
    # dot plot of up to the first 100 markers for this cluster
    sc.pl.dotplot(adata, {clus: plot_dict[clus][:100]}, groupby='leiden', use_raw=False, save='_cluster_markers_'+clus+'.pdf')
#goodbye clutter!
MovePlots('cluster_markers','cluster_markers')
And with that, we're done. Save the object for later.
# persist the doublet-free, batch-corrected object with its marker annotations
adata.write('endometrium-N2-10x.h5ad')